#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copied verbatim from https://github.com/soskek/bookcorpus

# Requirements:
#
# beautifulsoup4>=4.6.3
# html2text>=2018.1.9
# blingfire>=0.0.9
# progressbar>=2.5
# lxml>=4.3.2

import os
import sys
import urllib
try:
    from urllib import unquote
except:
    from urllib.parse import unquote
import zipfile

import xml.parsers.expat
import html2text
from glob import glob
from pprint import pprint as pp


class ContainerParser:
    """Locate the package document (OPF) path declared in a container XML.

    The parser walks the ``<rootfile>`` elements of the document and picks
    the ``full-path`` of the first one whose ``media-type`` is the OPF
    package type.  When no rootfile carries that media type, the first
    rootfile that has a ``full-path`` is used instead, so containers that
    omit ``media-type`` keep working.
    """

    # Media type identifying a rootfile as an OPF package document.
    OPF_MEDIA_TYPE = "application/oebps-package+xml"

    def __init__(self, xmlcontent=None):
        """Remember the raw container XML (``bytes`` or ``str``) for parsing."""
        self.rootfile = ""
        self.xml = xmlcontent
        # Becomes True once an OPF-typed rootfile has been selected; later
        # rootfile elements (alternate renditions) are then ignored.
        self._found_package = False

    def startElement(self, name, attributes):
        """Expat start-tag handler: record the best ``<rootfile>`` seen so far.

        ``attributes`` is the attribute dict that expat passes to the handler.
        """
        if name != "rootfile" or self._found_package:
            return
        path = attributes.get("full-path")
        if not path:
            # A rootfile without a path cannot be opened; skip it.
            return
        if attributes.get("media-type") == self.OPF_MEDIA_TYPE:
            self.rootfile = path
            self._found_package = True
        elif not self.rootfile:
            # Fallback candidate, kept only until an OPF-typed entry shows up.
            self.rootfile = path

    def parseContainer(self):
        """Parse the stored XML and return the selected rootfile path.

        Returns an empty string when the document declares no usable
        ``<rootfile>``.  Malformed XML raises ``xml.parsers.expat.ExpatError``.
        """
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.startElement
        parser.Parse(self.xml, 1)
        return self.rootfile


class BookParser:
    """Extract title, author and NCX location from an OPF package document.

    Title and author are the text of the ``dc:title`` and ``dc:creator``
    elements (the last occurrence wins).  The NCX is the ``href`` of the
    manifest ``<item>`` that best matches: an item with the NCX media type
    is preferred over one that merely has a conventional id (``ncx``,
    ``toc`` or ``ncxtoc``).
    """

    # Media type that identifies a manifest item as an NCX document.
    NCX_MEDIA_TYPE = "application/x-dtbncx+xml"
    # Manifest ids conventionally used for the NCX item.
    NCX_ITEM_IDS = ("ncx", "toc", "ncxtoc")

    def __init__(self, xmlcontent=None):
        """Remember the raw OPF XML (``bytes`` or ``str``) for parsing."""
        self.xml = xmlcontent
        self.title = ""
        self.author = ""
        self.inTitle = 0
        self.inAuthor = 0
        self.ncx = ""
        # Accumulates character data of the element currently being read.
        self.buffer = ""
        # Strength of the match that produced self.ncx (0 = none yet).
        self._ncx_rank = 0

    def startElement(self, name, attributes):
        """Expat start-tag handler: open a text capture or inspect an item."""
        if name == "dc:title":
            self.buffer = ""
            self.inTitle = 1
        elif name == "dc:creator":
            self.buffer = ""
            self.inAuthor = 1
        elif name == "item":
            self._consider_ncx_item(attributes)

    def _consider_ncx_item(self, attributes):
        """Adopt this manifest item as the NCX if it matches at least as well."""
        href = attributes.get("href")
        if not href:
            # Nothing to open; also avoids a KeyError on incomplete items.
            return
        rank = 0
        if attributes.get("id") in self.NCX_ITEM_IDS:
            rank += 1
        if attributes.get("media-type") == self.NCX_MEDIA_TYPE:
            # The media type outweighs the id, so an XHTML item that merely
            # has id="toc" cannot displace a real NCX.
            rank += 2
        # ">=" keeps the "last matching item wins" rule among equal matches.
        if rank and rank >= self._ncx_rank:
            self.ncx = href
            self._ncx_rank = rank

    def characters(self, data):
        """Expat character-data handler: collect text inside title/creator."""
        if self.inTitle or self.inAuthor:
            self.buffer += data

    def endElement(self, name):
        """Expat end-tag handler: store the captured title or author text."""
        if name == "dc:title":
            self.inTitle = 0
            self.title = self.buffer
            self.buffer = ""
        elif name == "dc:creator":
            self.inAuthor = 0
            self.author = self.buffer
            self.buffer = ""

    def parseBook(self):
        """Parse the stored XML and return ``(title, author, ncx)``.

        Each element of the tuple is an empty string when the document does
        not provide it.  Malformed XML raises
        ``xml.parsers.expat.ExpatError``.
        """
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.startElement
        parser.EndElementHandler = self.endElement
        parser.CharacterDataHandler = self.characters
        parser.Parse(self.xml, 1)
        return self.title, self.author, self.ncx


class NavPoint:
    """One table-of-contents entry.

    Attributes:
        id: identifier of the entry.
        content: target the entry points at (may carry a ``#fragment``).
        playorder: reading-order value of the entry.
        level: nesting depth of the entry, 0 for top-level entries.
        text: human-readable label of the entry.
    """

    def __init__(self, id=None, playorder=None, level=0, content=None, text=None):
        """Store every constructor argument on the instance unchanged."""
        self.id, self.content = id, content
        self.playorder, self.level = playorder, level
        self.text = text


class TocParser:
    """Flatten the ``<navPoint>`` tree of an NCX document into a list.

    Every ``<navPoint>`` becomes a ``NavPoint`` whose ``level`` is its
    nesting depth.  ``<content src=...>`` and ``<text>`` elements are
    attributed to the innermost navPoint that is still open; the same
    elements occurring outside any navPoint are ignored.
    """

    def __init__(self, xmlcontent=None):
        """Remember the raw NCX XML (``bytes`` or ``str``) for parsing."""
        self.xml = xmlcontent
        # Innermost navPoint that is currently open, or None outside of one.
        self.currentNP = None
        # Chain of open navPoints, outermost first.
        self.stack = []
        self.inText = 0
        # All navPoints in document order.
        self.toc = []
        # Accumulates the character data of the current <text> element.
        self.buffer = ""

    def startElement(self, name, attributes):
        """Expat start-tag handler for navPoint, content and text elements."""
        # TODO: what to do when no navpoints? Example: https://imgur.com/gAWuSaf
        if name == "navPoint":
            level = len(self.stack)
            # .get() tolerates documents that omit playOrder (or id).
            self.currentNP = NavPoint(
                attributes.get("id"), attributes.get("playOrder"), level)
            self.stack.append(self.currentNP)
            self.toc.append(self.currentNP)
        elif name == "content":
            # <content> outside an open navPoint must not touch the toc.
            if self.currentNP is not None and "src" in attributes:
                self.currentNP.content = unquote(attributes["src"])
        elif name == "text":
            self.buffer = ""
            self.inText = 1

    def characters(self, data):
        """Expat character-data handler: collect the label inside <text>."""
        if self.inText:
            self.buffer += data

    def endElement(self, name):
        """Expat end-tag handler: close a navPoint or commit a label."""
        if name == "navPoint":
            self.stack.pop()
            # Hand control back to the enclosing navPoint (None at top level)
            # so later <text>/<content> elements cannot overwrite the entry
            # that was just closed.
            self.currentNP = self.stack[-1] if self.stack else None
        elif name == "text":
            if self.inText and self.currentNP is not None:
                self.currentNP.text = self.buffer
            self.inText = 0

    def parseToc(self):
        """Parse the stored XML and return the list of ``NavPoint`` objects.

        The list is empty when the document contains no ``<navPoint>``.
        Malformed XML raises ``xml.parsers.expat.ExpatError``.
        """
        parser = xml.parsers.expat.ParserCreate()
        parser.StartElementHandler = self.startElement
        parser.EndElementHandler = self.endElement
        parser.CharacterDataHandler = self.characters
        parser.Parse(self.xml, 1)
        return self.toc


class epub2txt:
    """Convert an EPUB archive to text by following its NCX table of contents.

    The files referenced by the table of contents are converted with
    ``html2text`` in the order in which they are first referenced, and the
    results are concatenated.
    """

    def __init__(self, epubfile=None):
        """Remember the EPUB to convert (anything ``zipfile.ZipFile`` accepts)."""
        self.epub = epubfile

    def convert(self):
        """Return the text of the book as a single string.

        Raises:
            KeyError: a file named by the container, the package document or
                the table of contents is missing from the archive.
            UnicodeDecodeError: a content file is not valid UTF-8.
            xml.parsers.expat.ExpatError: one of the XML documents is
                malformed.
        """
        content = []
        # The context manager closes the archive even if a step below raises.
        with zipfile.ZipFile(self.epub, "r") as archive:
            rootfile = ContainerParser(
                archive.read("META-INF/container.xml")).parseContainer()
            _title, _author, ncx = BookParser(archive.read(rootfile)).parseBook()

            # Directory of the package document; hrefs are resolved against it.
            ops = "/".join(rootfile.split("/")[:-1])
            if ops != "":
                ops = ops + "/"
            toc = TocParser(archive.read(ops + ncx)).parseToc()

            seen = set()
            for nav_point in toc:
                # Several entries may point into the same file through
                # different #fragments; each file is converted only once.
                xmlfile = (nav_point.content or "").split("#")[0]
                if not xmlfile or xmlfile in seen:
                    # Entries without a file part have nothing to read.
                    continue
                seen.add(xmlfile)
                html = archive.read(ops + xmlfile).decode("utf-8")

                # A fresh converter per file, so nothing carries over from
                # the previous document.
                converter = html2text.HTML2Text()
                converter.body_width = 0  # 0 turns off html2text's line wrapping
                content.append(converter.handle(html) + "\n")

        result = ''.join(content)
        # final postprocessing fixups: tables come out all weird, so
        # fix them with a hack.
        result = result.replace('\n\n| \n\n', ' | ')
        return result


def main(argv=None):
    """Command-line entry point: convert one EPUB, or every EPUB matching a glob.

    Args:
        argv: argument list without the program name; defaults to
            ``sys.argv[1:]``.  ``argv[0]`` is an EPUB path (treated as a glob
            pattern when it contains ``*``).  The optional ``argv[1]`` is the
            output path; ``-`` or no value means standard output.

    Returns:
        Process exit status: 0 on success, 2 when no input was given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.stderr.write(
            "usage: %s EPUB_OR_GLOB [OUTFILE|-]\n" % os.path.basename(sys.argv[0]))
        return 2

    infile = args[0]
    outfile = args[1] if len(args) > 1 else '-'
    filenames = glob(infile) if '*' in infile else [infile]

    out = None
    try:
        for filename in filenames:
            txt = epub2txt(filename).convert()
            if not txt.strip():
                continue
            if out is None:
                # Opened lazily so no output file is created when every book
                # turns out empty.  The comparison is by value: a "-" taken
                # from the command line is not the same object as the literal.
                if outfile == '-':
                    out = sys.stdout
                else:
                    out = open(outfile, "w", encoding="utf-8")
            out.write(txt)
            out.flush()
    finally:
        # Close only a file this function opened; sys.stdout is left alone.
        if out is not None and out is not sys.stdout:
            out.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())

# TODO: Look into this bug: https://i.imgur.com/zDora9Y.png
